using System;
using System.Runtime.Intrinsics.X86;
using size_t = nuint;

/// <summary>
/// Contract for a disposable matrix multiplier operating on <see cref="double"/> data.
/// Implementations supply the raw-pointer overload; the array overload is a default
/// interface method that validates shapes, pins the arrays and forwards to it.
/// </summary>
public unsafe interface IMatrixMultiplier : IDisposable
{
	/// <summary>
	/// Gets or sets the large-page preference of the multiplier.
	/// NOTE(review): presumably asks the implementation to attempt large-page allocation
	/// for its work buffers; the exact effect is defined by the implementing type — confirm there.
	/// </summary>
	bool TryLargePage { get; set; }

	/// <summary>
	/// Multiplies <paramref name="A"/> by <paramref name="B"/> into <paramref name="prod"/>.
	/// The call is forwarded to the pointer overload with <c>add</c> set to <c>false</c> and
	/// each stride equal to the column count of the corresponding matrix.
	/// </summary>
	/// <param name="prod">Destination matrix; must have <c>A</c>'s row count and <c>B</c>'s column count.</param>
	/// <param name="A">Left operand.</param>
	/// <param name="B">Right operand; its row count must equal <c>A</c>'s column count.</param>
	/// <exception cref="ArgumentNullException">Any of the three arrays is <c>null</c>.</exception>
	/// <exception cref="ArgumentException">The matrix dimensions are not compatible.</exception>
	public void Multiply(double[,] prod, double[,] A, double[,] B)
	{
		// Reject null arrays up front so the caller gets the offending parameter name
		// rather than a NullReferenceException from GetLength.
		if (prod is null) throw new ArgumentNullException(nameof(prod));
		if (A is null) throw new ArgumentNullException(nameof(A));
		if (B is null) throw new ArgumentNullException(nameof(B));

		static size_t RowN(double[,] M) => (size_t)M.GetLength(0);
		static size_t ColN(double[,] M) => (size_t)M.GetLength(1);

		size_t prod_rows = RowN(prod);
		size_t prod_cols = ColN(prod);
		size_t A_rows = RowN(A);
		size_t A_cols = ColN(A);
		size_t B_rows = RowN(B);
		size_t B_cols = ColN(B);

		if (prod_rows != A_rows || prod_cols != B_cols || A_cols != B_rows) {
			throw new ArgumentException(
				$"Incompatible matrix dimensions: prod is {prod_rows}x{prod_cols}, " +
				$"A is {A_rows}x{A_cols}, B is {B_rows}x{B_cols}.");
		}

		// Pin all three arrays for the duration of the native-style call.
		fixed (double* prod_ptr = prod, A_ptr = A, B_ptr = B) {
			// After validation B's column count equals prod_cols, so prod_cols is
			// passed as the stride for both prod and B.
			Multiply(false, prod_ptr, A_ptr, B_ptr,
				prod_rows, prod_cols, A_cols,
				prod_cols, A_cols, prod_cols);
		}
	}

	/// <summary>
	/// Raw-pointer multiplication entry point supplied by implementations.
	/// </summary>
	/// <param name="add">
	/// The array overload passes <c>false</c>.
	/// NOTE(review): presumably <c>true</c> accumulates into <paramref name="prod"/> instead of
	/// overwriting it — confirm against the implementations.
	/// </param>
	/// <param name="prod">Pointer to the destination matrix data.</param>
	/// <param name="A">Pointer to the left operand data.</param>
	/// <param name="B">Pointer to the right operand data.</param>
	/// <param name="prod_rows">Row count of the product (and of <c>A</c>).</param>
	/// <param name="prod_cols">Column count of the product (and of <c>B</c>).</param>
	/// <param name="A_cols">Column count of <c>A</c> (and row count of <c>B</c>).</param>
	/// <param name="prod_stride">Stride of <c>prod</c>; the array overload passes its column count.</param>
	/// <param name="A_stride">Stride of <c>A</c>; the array overload passes its column count.</param>
	/// <param name="B_stride">Stride of <c>B</c>; the array overload passes its column count.</param>
	void Multiply(bool add, double* prod, double* A, double* B,
		size_t prod_rows, size_t prod_cols, size_t A_cols,
		size_t prod_stride, size_t A_stride, size_t B_stride);
}

/// <summary>
/// Factory methods that build an <see cref="IMatrixMultiplier"/> for a chosen
/// instruction-set specific kernel, optionally wrapped in a parallel multiplier.
/// </summary>
public static class MatrixMultipliers
{
	/// <summary>
	/// Creates a multiplier using the first supported kernel in the order
	/// FMA, AVX, SSE2, and finally the non-intrinsic ("vanilla") kernel.
	/// </summary>
	/// <param name="threadNum">
	/// Must be -1 or a positive number. 1 yields a non-parallel multiplier; any other
	/// accepted value is handed to the parallel multiplier.
	/// NOTE(review): -1 presumably means "choose automatically" — confirm in ParallelMatrixMultiplier.
	/// </param>
	/// <param name="tryLargePage">Value assigned to <see cref="IMatrixMultiplier.TryLargePage"/>.</param>
	/// <exception cref="ArgumentOutOfRangeException"><paramref name="threadNum"/> is 0 or less than -1.</exception>
	public static IMatrixMultiplier Create(int threadNum = -1, bool tryLargePage = true)
	{
		if (Fma.IsSupported) {
			return CreateFma(threadNum, tryLargePage);
		}
		if (Avx.IsSupported) {
			return CreateAvx(threadNum, tryLargePage);
		}
		if (Sse2.IsSupported) {
			return CreateSse2(threadNum, tryLargePage);
		}
		return CreateVanilla(threadNum, tryLargePage);
	}

	/// <summary>Creates a multiplier backed by the FMA kernel; hardware support is not checked here.</summary>
	public static IMatrixMultiplier CreateFma(int threadNum = -1, bool tryLargePage = true)
		=> Create<MatrixMulFma>(threadNum, tryLargePage);

	/// <summary>Creates a multiplier backed by the AVX kernel; hardware support is not checked here.</summary>
	public static IMatrixMultiplier CreateAvx(int threadNum = -1, bool tryLargePage = true)
		=> Create<MatrixMulAvx>(threadNum, tryLargePage);

	/// <summary>Creates a multiplier backed by the SSE2 kernel; hardware support is not checked here.</summary>
	public static IMatrixMultiplier CreateSse2(int threadNum = -1, bool tryLargePage = true)
		=> Create<MatrixMulSse2>(threadNum, tryLargePage);

	/// <summary>Creates a multiplier backed by the non-intrinsic kernel.</summary>
	public static IMatrixMultiplier CreateVanilla(int threadNum = -1, bool tryLargePage = true)
		=> Create<MatrixMulVanilla>(threadNum, tryLargePage);

	/// <summary>
	/// Shared construction path: validates <paramref name="threadNum"/>, picks the
	/// single-threaded or parallel wrapper around <c>MatrixMulCommon&lt;T&gt;</c>,
	/// and applies the large-page preference.
	/// </summary>
	static IMatrixMultiplier Create<T>(int threadNum, bool tryLargePage) where T : IMatrixMulSimdSpecific, new()
	{
		// Only -1 and strictly positive values are accepted.
		if (threadNum < -1 || threadNum == 0) {
			// ArgumentOutOfRangeException derives from ArgumentException, so callers
			// that catch ArgumentException still catch this.
			throw new ArgumentOutOfRangeException(nameof(threadNum), threadNum,
				"Thread count must be -1 or a positive number.");
		}

		// if/else rather than a conditional expression: the two branches have different
		// concrete types with no conversion between them.
		IMatrixMultiplier mul;
		if (threadNum == 1) {
			mul = new MatrixMulCommon<T>();
		} else {
			mul = new ParallelMatrixMultiplier<MatrixMulCommon<T>>(threadNum);
		}

		mul.TryLargePage = tryLargePage;
		return mul;
	}
}
